Latent Variables active in Random Forests

First we get the metaviper predictions, LV scores, and Random Forest weights from Synapse

#get immune predictions
dtab<-synapser::synTableQuery(paste('select * from',mp_scores))$asDataFrame()%>%
  subset(isCellLine!='TRUE')
## 
Create CSV FileHandle [##########----------]50.13%   79091/157768       
Create CSV FileHandle [####################]100.00%   157768/157768   Done...    
Downloading  [#-------------------]7.43%   2.0MB/26.9MB (1.5MB/s) Job-100530171440920083890700450.csv     
Downloading  [###-----------------]14.85%   4.0MB/26.9MB (1.7MB/s) Job-100530171440920083890700450.csv     
Downloading  [####----------------]22.28%   6.0MB/26.9MB (1.7MB/s) Job-100530171440920083890700450.csv     
Downloading  [######--------------]29.71%   8.0MB/26.9MB (1.8MB/s) Job-100530171440920083890700450.csv     
Downloading  [#######-------------]37.13%   10.0MB/26.9MB (1.8MB/s) Job-100530171440920083890700450.csv     
Downloading  [#########-----------]44.56%   12.0MB/26.9MB (1.8MB/s) Job-100530171440920083890700450.csv     
Downloading  [##########----------]51.99%   14.0MB/26.9MB (1.8MB/s) Job-100530171440920083890700450.csv     
Downloading  [############--------]59.41%   16.0MB/26.9MB (1.8MB/s) Job-100530171440920083890700450.csv     
Downloading  [#############-------]66.84%   18.0MB/26.9MB (1.8MB/s) Job-100530171440920083890700450.csv     
Downloading  [###############-----]74.27%   20.0MB/26.9MB (1.9MB/s) Job-100530171440920083890700450.csv     
Downloading  [################----]81.69%   22.0MB/26.9MB (1.8MB/s) Job-100530171440920083890700450.csv     
Downloading  [##################--]89.12%   24.0MB/26.9MB (1.9MB/s) Job-100530171440920083890700450.csv     
Downloading  [###################-]96.54%   26.0MB/26.9MB (2.0MB/s) Job-100530171440920083890700450.csv     
Downloading  [####################]100.00%   26.9MB/26.9MB (2.0MB/s) Job-100530171440920083890700450.csv Done...
##get metaviper scores
mtab<-synapser::synTableQuery(paste('select * from',metaviper_scores))$asDataFrame()
## 
Building the CSV... [##------------------]12.04%   176681/1467984       
Building the CSV... [####----------------]18.82%   276300/1467984       
Building the CSV... [#######-------------]32.92%   483256/1467984       
Building the CSV... [########------------]39.81%   584370/1467984       
Building the CSV... [#########-----------]46.83%   687480/1467984       
Building the CSV... [####################]100.00%   1467984/1467984   Done...    
Downloading  [--------------------]2.47%   2.0MB/81.0MB (1.5MB/s) Job-100530342590765623229243657.csv     
Downloading  [#-------------------]4.94%   4.0MB/81.0MB (1.7MB/s) Job-100530342590765623229243657.csv     
Downloading  [#-------------------]7.40%   6.0MB/81.0MB (1.8MB/s) Job-100530342590765623229243657.csv     
Downloading  [##------------------]9.87%   8.0MB/81.0MB (1.8MB/s) Job-100530342590765623229243657.csv     
Downloading  [##------------------]12.34%   10.0MB/81.0MB (1.8MB/s) Job-100530342590765623229243657.csv     
Downloading  [###-----------------]14.81%   12.0MB/81.0MB (1.8MB/s) Job-100530342590765623229243657.csv     
Downloading  [###-----------------]17.27%   14.0MB/81.0MB (1.8MB/s) Job-100530342590765623229243657.csv     
Downloading  [####----------------]19.74%   16.0MB/81.0MB (1.9MB/s) Job-100530342590765623229243657.csv     
Downloading  [####----------------]22.21%   18.0MB/81.0MB (1.8MB/s) Job-100530342590765623229243657.csv     
Downloading  [#####---------------]24.68%   20.0MB/81.0MB (1.9MB/s) Job-100530342590765623229243657.csv     
Downloading  [#####---------------]27.14%   22.0MB/81.0MB (2.0MB/s) Job-100530342590765623229243657.csv     
Downloading  [######--------------]29.61%   24.0MB/81.0MB (2.0MB/s) Job-100530342590765623229243657.csv     
Downloading  [######--------------]32.08%   26.0MB/81.0MB (2.1MB/s) Job-100530342590765623229243657.csv     
Downloading  [#######-------------]34.55%   28.0MB/81.0MB (2.1MB/s) Job-100530342590765623229243657.csv     
Downloading  [#######-------------]37.01%   30.0MB/81.0MB (2.2MB/s) Job-100530342590765623229243657.csv     
Downloading  [########------------]39.48%   32.0MB/81.0MB (2.3MB/s) Job-100530342590765623229243657.csv     
Downloading  [########------------]41.95%   34.0MB/81.0MB (2.3MB/s) Job-100530342590765623229243657.csv     
Downloading  [#########-----------]44.42%   36.0MB/81.0MB (2.4MB/s) Job-100530342590765623229243657.csv     
Downloading  [#########-----------]46.88%   38.0MB/81.0MB (2.4MB/s) Job-100530342590765623229243657.csv     
Downloading  [##########----------]49.35%   40.0MB/81.0MB (2.5MB/s) Job-100530342590765623229243657.csv     
Downloading  [##########----------]51.82%   42.0MB/81.0MB (2.6MB/s) Job-100530342590765623229243657.csv     
Downloading  [###########---------]54.29%   44.0MB/81.0MB (2.6MB/s) Job-100530342590765623229243657.csv     
Downloading  [###########---------]56.76%   46.0MB/81.0MB (2.6MB/s) Job-100530342590765623229243657.csv     
Downloading  [############--------]59.22%   48.0MB/81.0MB (2.7MB/s) Job-100530342590765623229243657.csv     
Downloading  [############--------]61.69%   50.0MB/81.0MB (2.7MB/s) Job-100530342590765623229243657.csv     
Downloading  [#############-------]64.16%   52.0MB/81.0MB (2.8MB/s) Job-100530342590765623229243657.csv     
Downloading  [#############-------]66.63%   54.0MB/81.0MB (2.8MB/s) Job-100530342590765623229243657.csv     
Downloading  [##############------]69.09%   56.0MB/81.0MB (2.9MB/s) Job-100530342590765623229243657.csv     
Downloading  [##############------]71.56%   58.0MB/81.0MB (2.9MB/s) Job-100530342590765623229243657.csv     
Downloading  [###############-----]74.03%   60.0MB/81.0MB (2.9MB/s) Job-100530342590765623229243657.csv     
Downloading  [###############-----]76.50%   62.0MB/81.0MB (3.0MB/s) Job-100530342590765623229243657.csv     
Downloading  [################----]78.96%   64.0MB/81.0MB (3.0MB/s) Job-100530342590765623229243657.csv     
Downloading  [################----]81.43%   66.0MB/81.0MB (3.0MB/s) Job-100530342590765623229243657.csv     
Downloading  [#################---]83.90%   68.0MB/81.0MB (3.1MB/s) Job-100530342590765623229243657.csv     
Downloading  [#################---]86.37%   70.0MB/81.0MB (3.1MB/s) Job-100530342590765623229243657.csv     
Downloading  [##################--]88.83%   72.0MB/81.0MB (3.2MB/s) Job-100530342590765623229243657.csv     
Downloading  [##################--]91.30%   74.0MB/81.0MB (3.2MB/s) Job-100530342590765623229243657.csv     
Downloading  [###################-]93.77%   76.0MB/81.0MB (3.3MB/s) Job-100530342590765623229243657.csv     
Downloading  [###################-]96.24%   78.0MB/81.0MB (3.3MB/s) Job-100530342590765623229243657.csv     
Downloading  [####################]98.70%   80.0MB/81.0MB (3.3MB/s) Job-100530342590765623229243657.csv     
Downloading  [####################]100.00%   81.0MB/81.0MB (3.4MB/s) Job-100530342590765623229243657.csv Done...
##get rf loadings
rftab<-synapser::synTableQuery(paste('select * from',rf_mp))$asDataFrame()%>%
  select(LV_Full,`Cutaneous Neurofibroma`,`Neurofibroma`,`Malignant Peripheral Nerve Sheath Tumor`,`Plexiform Neurofibroma`)%>%
  mutate(latent_var=gsub('`','',LV_Full))%>%
  select(-LV_Full)
## 
 [####################]100.00%   1/1   Done...    
Downloading  [####################]100.00%   79.8kB/79.8kB (387.7kB/s) Job-10053071691687645443407267.csv Done...
samps<-intersect(dtab$specimenID,mtab$specimenID)

mp_res<-dtab%>%
  subset(specimenID%in%samps)%>%
  group_by(latent_var) %>%
  mutate(sd_value = sd(value)) %>%
  filter(sd_value > 0.05) %>%
  ungroup()%>%
  select(latent_var,value,specimenID,sd_value,diagnosis)%>%
  left_join(rftab,by='latent_var')

combined<-mp_res%>%ungroup()%>%inner_join(mtab,by='specimenID')

#now compute some basic stats
mp_stats<-mp_res%>%
  rowwise()%>%mutate(MaxVal=max(`Cutaneous Neurofibroma`,`Plexiform Neurofibroma`,`Malignant Peripheral Nerve Sheath Tumor`,Neurofibroma))%>%
  rowwise()%>%
  mutate(MeanVal=mean(c(`Cutaneous Neurofibroma`,`Plexiform Neurofibroma`,`Malignant Peripheral Nerve Sheath Tumor`,Neurofibroma)))

DT::datatable(mp_stats)

We can now see which latent variables seem to be ‘active’ in various random forests. Either uniquely for each tumor type or across the board. Let’s take the top 10 across the various tumor types as well as the highest mean value

cols=c("Cutaneous Neurofibroma", "Neurofibroma", 'Malignant Peripheral Nerve Sheath Tumor',"Plexiform Neurofibroma","MeanVal")

top10<-do.call(cbind,lapply(cols,function(x){
  nrf<-rename(mp_stats,dis=x)%>%
    select(dis,latent_var)%>%distinct()%>%
    arrange(desc(dis))%>%select(latent_var)
  nrf[1:10,1]
}))
names(top10)<-cols

DT::datatable(top10)

Plotting protein correlations

With the top 10 most impactful LVs for each random forest prediction, we can plot those metaviper proteins that correlate with them.

corVals=combined%>%subset(latent_var%in%unique(unlist(top10)))%>%
    group_by(latent_var,gene)%>%
  summarize(corVal=cor(value,metaviperscore,use='pairwise.complete.obs'),numSamps=n_distinct(specimenID))

corVals
## # A tibble: 146,208 x 4
## # Groups:   latent_var [24]
##    latent_var               gene    corVal numSamps
##    <chr>                    <chr>    <dbl>    <int>
##  1 1,REACTOME_MRNA_SPLICING AATF    0.531        81
##  2 1,REACTOME_MRNA_SPLICING ABCA1  -0.549        81
##  3 1,REACTOME_MRNA_SPLICING ABCC8  -0.144        81
##  4 1,REACTOME_MRNA_SPLICING ABCC9  -0.590        81
##  5 1,REACTOME_MRNA_SPLICING ABCG1  -0.647        81
##  6 1,REACTOME_MRNA_SPLICING ABCG4   0.108        81
##  7 1,REACTOME_MRNA_SPLICING ABI1   -0.503        81
##  8 1,REACTOME_MRNA_SPLICING ABL1   -0.0134       81
##  9 1,REACTOME_MRNA_SPLICING ABL2   -0.173        81
## 10 1,REACTOME_MRNA_SPLICING ABLIM3 -0.680        81
## # … with 146,198 more rows
##now how do we bracket them?
##plot correlation distributions by cell type and method.
require(ggplot2)

##first re-order variables to plot
top.df<-top10%>%rename(All='MeanVal')%>%gather(key="disease",value="pathway")

p<-corVals%>%
              ungroup()%>%
  subset(latent_var%in%unique(unlist(top10)))%>%
          #    mutate(LatentVariable = stringr::str_trim(as.character(latent_var), 20))%>%
              ggplot()+geom_boxplot(aes(x=latent_var,y=corVal))+ theme(axis.text.x = element_text(angle = 45, hjust = 1))+ggtitle("Correlation of metaviper proteins with lv")
print(p)

There are some proteins that show up as highly correlated. By choosing a threshold, we can evaluate what they are in more detail.

These plots represent the top 10 latent variables for a predictor of each tumor type and the proteins that are correlated with them.

corthresh=0.75

##now filter to the cell types with correlated proteins
cor_cell_types=subset(corVals,corVal>corthresh)%>%
  subset(latent_var%in%unique(unlist(top10)))%>%
      ungroup()%>%
  select(latent_var)%>%
  distinct()

print(paste('we found',nrow(cor_cell_types),'lvs with some protein correlation greater than',corthresh))
## [1] "we found 19 lvs with some protein correlation greater than 0.75"
DT::datatable(cor_cell_types)
apply(cor_cell_types,1,function(x){
  ct=x[['latent_var']]
#  m=x[['method']]

  #for each gene and cell type
  genes=subset(corVals,latent_var==ct)%>%
        subset(corVal>corthresh)%>%
   arrange(desc(corVal))%>%
      ungroup()

    if(nrow(genes)>12){
    new.corthresh=format(genes$corVal[12],digits=3)
    genes=genes[1:12,]
  }else{
    new.corthresh=corthresh
  }

  scores=subset(combined,gene%in%genes$gene)%>%subset(latent_var==ct)

  p2<- ggplot(scores)+
      geom_point(aes(x=value,y=metaviperscore,
          col=gene,shape=tumorType))+
  #  scale_x_log10()+
      ggtitle(paste(ct,'correlation >',new.corthresh,'\n',
        subset(top.df,pathway==ct)%>%select(disease)%>%unlist()%>%paste(collapse=',')))
  print(p2)
 # ggsave(paste0(m,'predictions of',gsub(" ","",gsub("/","",ct)),'cor',new.corthresh,'.pdf'))
})

## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

#parentid='syn20710537'
#for(fi in list.files('.')[grep('tions',list.files('.'))])
#  synapser::synStore(synapser::File(fi,parentId=parentid,annotations=list(resourceType='analysis',isMultiSpecimen='TRUE',isMultiIndividual='TRUE')),used=c(deconv_scores,metaviper_scores),executed=this.script)